#!/usr/bin/python
#coding=utf8
import re
import os 
import time
import random
#import cPickle
import codecs
import sys
reload(sys)

class Baike:
    """Split crawled page dumps into per-source output files.

    Every page is classified by its first ``<title>`` line as one of three
    sources -- index 0 = baidu, 1 = wiki, 2 = hudong -- and appended to that
    source's current output file ``<name>_NNNN.txt``.  A new numbered file
    is started for a source once ``maxpages`` pages have been counted for
    the current one.
    """

    def __init__(self):
        # All per-source lists are indexed 0 = baidu, 1 = wiki, 2 = hudong.
        self.pcount = [0, 0, 0]     # pages counted towards the current file
        self.filecount = [0, 0, 0]  # numeric suffix of the current file
        self.maxpages = 3000        # page budget of one output file
        self.names = ['res/baidu', 'res/wiki', 'res/hudong']
        # Markers searched for in the <title> line to recognise the source.
        self.wikistr = '维基百科'
        self.hudongstr = '互动百科'

    def splitone(self, filename):
        """Read one dump file and hand each buffered page to ``dispone``.

        Blank lines are skipped.  A line equal to ``EEEEE`` throws away the
        lines buffered so far; a line equal to ``SSSSS`` flushes them through
        ``dispone``.  Lines still buffered at end of file are not written.

        :param filename: path of the dump file to read.
        """
        with open(filename, 'r') as infile:
            onepage = []
            bkflag = -1  # -1 until a <title> line has classified the page
            for x in infile:
                marker = x.strip()
                if not marker:
                    continue
                if marker == "EEEEE":
                    onepage = []
                    bkflag = -1
                elif marker == "SSSSS":
                    # dispone ignores pages that were never classified.
                    self.dispone(onepage, bkflag)
                    onepage = []
                    bkflag = -1
                else:
                    # Only the first <title> line of a page decides its source.
                    if bkflag < 0 and '<title>' in x:
                        if self.hudongstr in x:
                            bkflag = 2
                        elif self.wikistr in x:
                            bkflag = 1
                        else:
                            bkflag = 0
                        self.pcount[bkflag] += 1
                    onepage.append(x)

    def dispone(self, onepage, bkflag):
        """Append one page to the current output file of its source.

        :param onepage: list of the page's lines (each keeps its newline).
        :param bkflag: source index (0 = baidu, 1 = wiki, 2 = hudong).  A
            negative value means the page was never classified; such a page
            is skipped, because a negative index would otherwise wrap around
            and file the page under the last source.
        """
        if bkflag < 0:
            return

        if self.pcount[bkflag] > self.maxpages:
            self.filecount[bkflag] += 1
            # The page written below is the first page of the new file, so
            # the counter restarts at 1; restarting at 0 would let every
            # file after the first take maxpages + 1 pages.
            self.pcount[bkflag] = 1
        print('fc[%d]: %d' % (bkflag, self.filecount[bkflag]))

        outname = self.names[bkflag] + '_%04d' % self.filecount[bkflag] + '.txt'
        with open(outname, 'a') as outfile:
            outfile.write(''.join(onepage))
            outfile.write('\nEEEEE\n\nSSSSS\n')

    def doSplit(self, datadir='/data/wwqa/baike/result/'):
        """Split every file in ``datadir`` whose name contains ``page``.

        :param datadir: directory holding the dump files; defaults to the
            location that used to be hard-coded here.
        """
        # Sorted so that the assignment of pages to output files does not
        # depend on the arbitrary order returned by os.listdir.
        pagefiles = sorted(x for x in os.listdir(datadir) if 'page' in x)
        for name in pagefiles:
            afile = os.path.join(datadir, name)
            print(afile)
            self.splitone(afile)
                          
if __name__ == '__main__':
    # Script entry point: split every dump file found in the splitter's
    # default input directory, then report completion.
    splitter = Baike()
    splitter.doSplit()
    print('ok')
